{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.core.interactiveshell import InteractiveShell\n",
    "InteractiveShell.ast_node_interactivity = 'all'  # default is ‘last_expr'\n",
    "\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.append('/home/mink/notebooks/CameraTraps')  # append this repo to PYTHONPATH\n",
    "sys.path.append('/home/mink/lib/ai4eutils')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "from collections import Counter, defaultdict\n",
    "from random import sample\n",
    "from shutil import copyfile\n",
    "from multiprocessing.pool import ThreadPool\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "\n",
    "import path_utils  # ai4eutils\n",
    "\n",
    "from data_management.megadb.schema import sequences_schema_check\n",
    "from data_management.megadb.converters.cct_to_megadb import process_sequences"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# awc_202103"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_name = 'awc_202103'\n",
    "\n",
    "container_root = '/mink_disk_0/camtraps/awc/'\n",
    "path_prefix = '202103drop/'\n",
    "\n",
    "path_to_output = f'/home/mink/camtraps/data/megadb_jsons/{dataset_name}.json' \n",
    "path_to_output_temp = f'/home/mink/camtraps/data/megadb_jsons/{dataset_name}_temp.json' "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 0 - Add an entry to the `datasets` table\n",
    "\n",
    "Done"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 1 - Prepare the `sequence` objects to insert into the database\n",
    "\n",
    "Folder structure for labeled folders (no location info, guessed sequence):\n",
    "\n",
    "`Reptiles`\n",
    "\n",
    "species - region/IMG_0463.JPG - can guess sequence from when the number in image name breaks, but sometimes there are consecutive sequences that we can't separate.\n",
    "\n",
    "`MZT reptile pics`\n",
    "\n",
    "species / 10_IMG_0779.JPG - same thing for sequence info.\n",
    "\n",
    "\n",
    "Folder structure for unlabeled folders (has location info but no sequence info), quite empty:\n",
    "\n",
    "`Bullo Rocky Cams Nov19-Feb20`\n",
    "\n",
    "location / RCNX0001.JPG\n",
    "\n",
    "\n",
    "`Treatment_R1`\n",
    "\n",
    "FM96 - MG102 (looks like both components are unique - location) / 3.1.20 (date - is different from the one in the header of the image though) / RCNX0001.JPG"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "494380"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "494332"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "folder = os.path.join(container_root, path_prefix)\n",
    "\n",
    "paths = path_utils.recursive_file_list(folder)\n",
    "len(paths)\n",
    "paths = sorted([p.split(folder)[1] for p in paths if path_utils.is_image_file(p) and not os.path.basename(p).startswith('.')])\n",
    "len(paths)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Bullo Rocky Cams Nov19-Feb20/HP212/RCNX1001.JPG'"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "'Treatment_R1/FM99 - MG144/10.2.20/RCNX0061.JPG'"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "paths[10]\n",
    "paths[-10]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Labeled folders"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 494332/494332 [00:00<00:00, 1659085.20it/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "712"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "labeled_folders = ('Reptiles', 'MZT reptile pics')\n",
    "\n",
    "images_labeled = []\n",
    "species = set()\n",
    "\n",
    "last_frame_num = 0\n",
    "seq_count = 0\n",
    "frame_in_seq = 1\n",
    "\n",
    "for p in tqdm(paths):\n",
    "    if not p.startswith(labeled_folders):\n",
    "        continue\n",
    "    \n",
    "    p_parts = p.split('/')\n",
    "    \n",
    "    clss = p_parts[1].split('-')[0].strip().lower()\n",
    "    species.add(clss)\n",
    "    \n",
    "    fn = p_parts[-1].split('.')[0]\n",
    "    fn_num = int(fn.split('_')[-1])\n",
    "    \n",
    "    if fn_num > last_frame_num + 1 or fn_num < last_frame_num:\n",
    "        seq_count += 1  # new sequence\n",
    "        frame_in_seq = 1\n",
    "    last_frame_num = fn_num\n",
    "    \n",
    "    \n",
    "    images_labeled.append({\n",
    "        'dataset': dataset_name,\n",
    "        'seq_id': f'labeled_{seq_count}',\n",
    "        'location': 'unknown',\n",
    "        'class': [clss],\n",
    "        'file': p,\n",
    "        'frame_num': frame_in_seq\n",
    "    })\n",
    "    frame_in_seq += 1\n",
    "    \n",
    "len(images_labeled)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'ctenophorus scutulatus',\n",
       " 'reptile sp',\n",
       " 'tiliqua occipitalis',\n",
       " 'varanus gouldii',\n",
       " 'varanus scalaris',\n",
       " 'varanus sp',\n",
       " 'varanus tristis',\n",
       " 'varanus varius'}"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "species"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'dataset': 'awc_202103',\n",
       "  'seq_id': 'labeled_6',\n",
       "  'location': 'unknown',\n",
       "  'class': ['varanus sp'],\n",
       "  'file': 'MZT reptile pics/Varanus sp/13_IMG_0188.JPG',\n",
       "  'frame_num': 2},\n",
       " {'dataset': 'awc_202103',\n",
       "  'seq_id': 'labeled_6',\n",
       "  'location': 'unknown',\n",
       "  'class': ['varanus sp'],\n",
       "  'file': 'MZT reptile pics/Varanus sp/14_IMG_0189.JPG',\n",
       "  'frame_num': 3},\n",
       " {'dataset': 'awc_202103',\n",
       "  'seq_id': 'labeled_7',\n",
       "  'location': 'unknown',\n",
       "  'class': ['varanus sp'],\n",
       "  'file': 'MZT reptile pics/Varanus sp/15_IMG_0154.JPG',\n",
       "  'frame_num': 1},\n",
       " {'dataset': 'awc_202103',\n",
       "  'seq_id': 'labeled_8',\n",
       "  'location': 'unknown',\n",
       "  'class': ['varanus sp'],\n",
       "  'file': 'MZT reptile pics/Varanus sp/1_IMG_0070.JPG',\n",
       "  'frame_num': 1},\n",
       " {'dataset': 'awc_202103',\n",
       "  'seq_id': 'labeled_8',\n",
       "  'location': 'unknown',\n",
       "  'class': ['varanus sp'],\n",
       "  'file': 'MZT reptile pics/Varanus sp/2_IMG_0071.JPG',\n",
       "  'frame_num': 2}]"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "images_labeled[10:15]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 712/712 [00:00<00:00, 747894.93it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The dataset_name is set to awc_202103. Please make sure this is correct!\n",
      "Making a deep copy of docs...\n",
      "Putting 712 images into sequences...\n",
      "Number of sequences: 172\n",
      "Checking the location field...\n",
      "Checking which fields in a CCT image entry are sequence-level...\n",
      "\n",
      "all_img_properties\n",
      "{'location', 'class', 'file', 'dataset', 'frame_num'}\n",
      "\n",
      "img_level_properties\n",
      "{'file', 'frame_num'}\n",
      "\n",
      "image-level properties that really should be sequence-level\n",
      "{'location', 'class', 'dataset'}\n",
      "\n",
      "Finished processing sequences.\n",
      "Example sequence items:\n",
      "\n",
      "{\"dataset\": \"awc_202103\", \"seq_id\": \"labeled_1\", \"location\": \"unknown\", \"images\": [{\"file\": \"MZT reptile pics/Reptile sp/0_IMG_0066.JPG\", \"frame_num\": 1}], \"class\": [\"reptile sp\"]}\n",
      "\n",
      "{\"dataset\": \"awc_202103\", \"seq_id\": \"labeled_33\", \"location\": \"unknown\", \"images\": [{\"file\": \"MZT reptile pics/Varanus varius/6_IMG_0117.JPG\", \"frame_num\": 1}], \"class\": [\"varanus varius\"]}\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "sequences_labeled = process_sequences(images_labeled, dataset_name)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Unlabeled folders"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 494332/494332 [00:03<00:00, 147250.66it/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "156"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "493620"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sequences_unlabeled = []\n",
    "locations = set()\n",
    "\n",
    "for p in tqdm(paths):\n",
    "    if p.startswith(labeled_folders):\n",
    "        continue\n",
    "    \n",
    "    p_parts = p.split('/')\n",
    "    \n",
    "    location = f'{p_parts[0]}_{p_parts[1]}'.replace(' ', '')\n",
    "    locations.add(location)\n",
    "    \n",
    "    # RCNX0007.JPG\n",
    "    im_id = p_parts[-1].split('.')[0].split('RCNX')[1]\n",
    "    \n",
    "    seq_id = f'dummy_{location}_{im_id}_{len(sequences_unlabeled)}'\n",
    "    \n",
    "    sequences_unlabeled.append({\n",
    "        'dataset': dataset_name,\n",
    "        'seq_id': seq_id,\n",
    "        'location': location,\n",
    "        'class': ['__label_unavailable'],\n",
    "        'images': [{\n",
    "            'file': p,\n",
    "            'frame_num': 1\n",
    "        }]\n",
    "    })\n",
    "    \n",
    "len(locations)\n",
    "len(sequences_unlabeled)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Combined "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "493792"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sequences = sequences_labeled + sequences_unlabeled\n",
    "len(sequences)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 2 - Pass the schema check"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Verified that the sequence items meet requirements not captured by the schema.\n",
      "Verified that the sequence items conform to the schema.\n",
      "CPU times: user 58.6 s, sys: 7.73 ms, total: 58.6 s\n",
      "Wall time: 58.6 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "sequences_schema_check.sequences_schema_check(sequences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(path_to_output_temp, 'w', encoding='utf-8') as f:\n",
    "    json.dump(sequences, f, ensure_ascii=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 2b - sample unlabeled sequences and copy to flat folder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "156"
      ]
     },
     "execution_count": 58,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sample_sequences_unlabeled = sample(sequences_unlabeled, 100000)\n",
    "locations_in_sample = set()\n",
    "for seq in sample_sequences_unlabeled:\n",
    "    locations_in_sample.add(seq['location'])\n",
    "len(locations_in_sample)  # all 156 locations are represented"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "100172"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 100172/100172 [00:00<00:00, 133857.04it/s]\n"
     ]
    }
   ],
   "source": [
    "seq_to_send = sequences_labeled + sample_sequences_unlabeled\n",
    "len(seq_to_send)\n",
    "\n",
    "path_pairs = []\n",
    "\n",
    "for seq in tqdm(seq_to_send):\n",
    "    seq_id = seq['seq_id']\n",
    "    \n",
    "    for im in seq['images']:\n",
    "        frame = im['frame_num']\n",
    "    \n",
    "        src_path = os.path.join(container_root, path_prefix, im['file'])\n",
    "        dst_path = os.path.join('/mink_disk_0/camtraps/imerit12g', \n",
    "                                f'{dataset_name}.seq{seq_id}.frame{frame}.jpg')\n",
    "        path_pairs.append((src_path, dst_path))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "100712"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "[('/mink_disk_0/camtraps/awc/202103drop/MZT reptile pics/Reptile sp/0_IMG_0066.JPG',\n",
       "  '/mink_disk_0/camtraps/imerit12g/awc_202103.seqlabeled_1.frame1.jpg'),\n",
       " ('/mink_disk_0/camtraps/awc/202103drop/MZT reptile pics/Reptile sp/1_IMG_0383.JPG',\n",
       "  '/mink_disk_0/camtraps/imerit12g/awc_202103.seqlabeled_2.frame1.jpg'),\n",
       " ('/mink_disk_0/camtraps/awc/202103drop/MZT reptile pics/Reptile sp/2_IMG_0384.JPG',\n",
       "  '/mink_disk_0/camtraps/imerit12g/awc_202103.seqlabeled_2.frame2.jpg'),\n",
       " ('/mink_disk_0/camtraps/awc/202103drop/MZT reptile pics/Varanus scalaris/0_IMG_0190.JPG',\n",
       "  '/mink_disk_0/camtraps/imerit12g/awc_202103.seqlabeled_3.frame1.jpg'),\n",
       " ('/mink_disk_0/camtraps/awc/202103drop/MZT reptile pics/Varanus scalaris/1_IMG_0191.JPG',\n",
       "  '/mink_disk_0/camtraps/imerit12g/awc_202103.seqlabeled_3.frame2.jpg'),\n",
       " ('/mink_disk_0/camtraps/awc/202103drop/MZT reptile pics/Varanus scalaris/2_IMG_0192.JPG',\n",
       "  '/mink_disk_0/camtraps/imerit12g/awc_202103.seqlabeled_3.frame3.jpg'),\n",
       " ('/mink_disk_0/camtraps/awc/202103drop/MZT reptile pics/Varanus sp/0_IMG_0568.JPG',\n",
       "  '/mink_disk_0/camtraps/imerit12g/awc_202103.seqlabeled_4.frame1.jpg'),\n",
       " ('/mink_disk_0/camtraps/awc/202103drop/MZT reptile pics/Varanus sp/10_IMG_0779.JPG',\n",
       "  '/mink_disk_0/camtraps/imerit12g/awc_202103.seqlabeled_5.frame1.jpg'),\n",
       " ('/mink_disk_0/camtraps/awc/202103drop/MZT reptile pics/Varanus sp/11_IMG_0780.JPG',\n",
       "  '/mink_disk_0/camtraps/imerit12g/awc_202103.seqlabeled_5.frame2.jpg'),\n",
       " ('/mink_disk_0/camtraps/awc/202103drop/MZT reptile pics/Varanus sp/12_IMG_0187.JPG',\n",
       "  '/mink_disk_0/camtraps/imerit12g/awc_202103.seqlabeled_6.frame1.jpg')]"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "('/mink_disk_0/camtraps/awc/202103drop/Treatment_R1/FM80 - MG109/31.12.19/RCNX0378.JPG',\n",
       " '/mink_disk_0/camtraps/imerit12g/awc_202103.seqdummy_Treatment_R1_FM80-MG109_0378_489577.frame1.jpg')"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(path_pairs)\n",
    "path_pairs[:10]\n",
    "path_pairs[-100]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1min 54s, sys: 3min 40s, total: 5min 34s\n",
      "Wall time: 6min 44s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "def copy_file(src_path, dst_path):\n",
    "    return copyfile(src_path, dst_path)\n",
    "\n",
    "with ThreadPool(12) as pool:\n",
    "    dst_paths = pool.starmap(copy_file, path_pairs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "100712"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(dst_paths)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:cameratraps] *",
   "language": "python",
   "name": "conda-env-cameratraps-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
